# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import annotations

import logging
from typing import ClassVar
from typing import Optional

from typing_extensions import override

from ..utils.feature_decorator import experimental
from .eval_case import Invocation
from .eval_metrics import EvalMetric
from .eval_metrics import Interval
from .eval_metrics import MetricInfo
from .eval_metrics import MetricValueInfo
from .eval_metrics import PrebuiltMetrics
from .eval_metrics import RubricsBasedCriterion
from .llm_as_judge_utils import get_text_from_content
from .llm_as_judge_utils import get_tool_calls_and_responses_as_json_str
from .llm_as_judge_utils import get_tool_declarations_as_json_str
from .rubric_based_evaluator import RubricBasedEvaluator

logger = logging.getLogger("google_adk." + __name__)

# Prompt template for the LLM auto-rater (judge). It is filled in via
# str.format in RubricBasedToolUseV1Evaluator.format_auto_rater_prompt with:
#   {tool_declarations} — JSON string of the agent's available tool declarations
#   {user_input}        — text extracted from the invocation's user content
#   {tool_usage}        — JSON string of the tool calls/responses made
#   {rubrics}           — the rubric properties the judge must assess
# NOTE: because the whole string goes through str.format, any literal curly
# braces added to this template in the future must be doubled ("{{", "}}").
_RUBRIC_BASED_TOOL_USE_QUALITY_V1_PROMPT = """# Mission
- Your mission is to evaluate the quality of responses generated by an AI agent. You will be presented with a user prompt (<user_prompt>), the agent's response (<response>) to that user prompt, and a set of properties (<property>) that you must use to objectively assess the validity of the agent's response.
- Only use the properties provided. Do not make up new properties.
- IMPORTANT: Assess all of the provided properties. Do not drop any of the properties from your response.
- The primary focus of this rating task is to check correctness of the agent's responses w.r.t. each of the properties.

# Rubric
"yes": The agent's response fulfilled the property or the property is not applicable to the response.
"no": The agent's response did not fulfill the property.

# For each property started with a new line, follow these steps:
STEP 1: Repeat the property, word for word, without making any changes. Keep everything including punctuation and capitalization as-is.
STEP 2: Determine the steps needed to **exactly**, **precisely** and **completely** determine whether the agent's response fulfilled the property.
STEP 3: Follow the steps outlined in STEP 2, thinking out loud.
STEP 4: Review the thoughts and the original property.
STEP 5: Output the final verdict.
Property: [[Repeat the property in STEP 1 again.]]
Rationale: [[Explain your reasoning for the verdict.]]
Verdict: [[yes|no]]

# Output format (repeat this format for every property started with a new line):
STEP 1: ...
STEP 2: ...
STEP 3: ...
STEP 4: ...
STEP 5: ...
Property: ...
Rationale: ...
Verdict: ...


# Example output 1

STEP 1: Does the agent run function call 'default_api.grammar_check'?
STEP 2: I need to check if the agent runs the function call with exact function name as 'default_api.grammar_check'.
STEP 3: The response includes a function call 'default_api.grammar_check'.
STEP 4: The function call format and the function name are correct.
STEP 5: yes
Property: Does the agent run function call 'default_api.grammar_check'?
Rationale: The agent's response contains the function call 'default_api.grammar_check' within a proper code block and with the correct function name.
Verdict: yes

STEP 1: Does the agent provide function call 'default_api.grammar_check' with input parameter 'sentence' that is valid compared to the reference 'sentence'= 'the dog walks on the a park' and based on the following guideline? Guideline for 'sentence': 'The wording can differ. The agent response is valid if it conveys similar core content as the reference response. Less efficient and minor inaccurate phrasing is acceptable. The default value is None, if the reference response includes this parameter with value equal to the default value but it is not provided in the agent response, then evaluate it as valid.'
STEP 2: I need to check if the function call 'default_api.grammar_check' includes the parameter 'sentence' and whether the value assigned to 'sentence' is valid according to the provided guideline. The reference value is 'the dog walks on the a park'. According to the guideline, the wording can differ as long as the core content is similar.
STEP 3: The agent's response includes the function call `default_api.grammar_check(sentence="the dog walks on the a park")`. The parameter 'sentence' is present, and the value assigned to it is "the dog walks on the a park", which is identical to the reference value.
STEP 4: The parameter 'sentence' is present and its value is exactly the same as the reference value.
STEP 5: yes
Property: Does the agent provide function call 'default_api.grammar_check' with input parameter 'sentence' that is valid compared to the reference 'sentence'= 'the dog walks on the a park' and based on the following guideline? Guideline for 'sentence': 'The wording can differ. The agent response is valid if it conveys similar core content as the reference response. Less efficient and minor inaccurate phrasing is acceptable. The default value is None, if the reference response includes this parameter with value equal to the default value but it is not provided in the agent response, then evaluate it as valid.'
Rationale: The agent's response includes the 'sentence' parameter in the function call 'default_api.grammar_check', and the value assigned to it is exactly the same as the reference value, thus satisfying the given guideline.
Verdict: yes

# Example output 2

STEP 1: Does the agent run function call 'default_api.search_via_perplexity'?
STEP 2: I need to check if the agent runs the function call with exact function name as 'default_api.search_via_perplexity'.
STEP 3: The response includes a function call `default_api.get_web_search_results`, which does not match 'default_api.search_via_perplexity'.
STEP 4: The function name does not match.
STEP 5: no
Property: Does the agent run function call 'default_api.search_via_perplexity'?
Rationale: The agent called 'default_api.get_web_search_results', not 'default_api.search_via_perplexity'.
Verdict: no

STEP 1: Does the agent provide function call 'default_api.search_via_perplexity' with input parameter 'keyword' that is valid compared to the reference 'keyword'= 'GPT-4o vs GPT-3.5 cost comparison' and based on the following guideline? Guideline for 'keyword': 'The wording can differ. The agent response is valid if it conveys similar core content as the reference response. Less efficient and minor inaccurate phrasing is acceptable.'
STEP 2: Since the previous property is no, this property is not applicable.
STEP 3: N/A
STEP 4: N/A
STEP 5: yes
Property: Does the agent provide function call 'default_api.search_via_perplexity' with input parameter 'keyword' that is valid compared to the reference 'keyword'= 'GPT-4o vs GPT-3.5 cost comparison' and based on the following guideline? Guideline for 'keyword': 'The wording can differ. The agent response is valid if it conveys similar core content as the reference response. Less efficient and minor inaccurate phrasing is acceptable.'
Rationale: The agent did not use the function call 'default_api.search_via_perplexity'.
Verdict: yes


# Available tools, user input, response and properties:
<available_tools>
{tool_declarations}
</available_tools>

<user_prompt>
{user_input}
</user_prompt>

<response>
{tool_usage}
</response>

<properties>
{rubrics}
</properties>

REMEMBER: Your answer will help improve the AI agent. It is important to determine the fulfillment of the properties correctly. Even answering "no" will improve the agent! Respond in pure text, not json.
IMPORTANT: Make sure for each of the property listed, follow the example steps and output "Property: ..." on a new line and "Verdict: ..." on another new line.
"""


@experimental
class RubricBasedToolUseV1Evaluator(RubricBasedEvaluator):
  """An Evaluator for rubric based assessment of the agent's usage of Tools.

  Example: Let's take an example of a Weather Agent that has access to two
  tools:
  1: GeoCoding Tool: Converts a city name, address or zip code into geographic
  coordinates.
  2: GetWeather Tool: Gets weather for the next 10 days for the given geographic
  coordinates.

  For this agent, one can create the following Rubrics that could focus on tool
  use:

  Rubric 1: A call is made to GeoCoding Tool.
  Rubric 2: A call is made to GetWeather Tool.
  Rubric 3: The call to GetWeather Tool happens after the GeoCoding Tool.
  Rubric 4: The input to GeoCoding Tool can be mapped back to user prompt.
  Rubric 5: The input to GetWeather Tool comes from the output of GeoCoding
  Tool.

  For each rubric, this evaluator will generate a confidence score between 0
  and 1, where 0 means that agent's response did not satisfy the rubric at all
  and 1 means complete adherence. Values closer to 1 are desirable.

  A combined score using individual rubric confidences will also be generated.
  Like individual rubric confidence scores, the range for this value will be
  between 0 and 1, and it will have the same interpretation.
  """

  # The criterion type this evaluator requires in the supplied EvalMetric.
  criterion_type: ClassVar[type[RubricsBasedCriterion]] = RubricsBasedCriterion

  def __init__(self, eval_metric: EvalMetric):
    """Initializes the evaluator.

    Args:
      eval_metric: The metric config; must carry a RubricsBasedCriterion, which
        the base class validates and from which it extracts the rubrics.
    """
    super().__init__(
        eval_metric,
        criterion_type=RubricBasedToolUseV1Evaluator.criterion_type,
    )
    self._auto_rater_prompt_template = _RUBRIC_BASED_TOOL_USE_QUALITY_V1_PROMPT

  @staticmethod
  def get_metric_info() -> MetricInfo:
    """Returns metadata about this metric: name, description and value range."""
    return MetricInfo(
        metric_name=PrebuiltMetrics.RUBRIC_BASED_TOOL_USE_QUALITY_V1.value,
        description=(
            # Fixed grammar of the user-facing description ("assess if ...
            # against" did not parse).
            "This metric assesses the agent's usage of tools against a set of"
            " rubrics using LLM as a judge. Value range for this metric is"
            " [0,1], with values closer to 1 more desirable."
        ),
        metric_value_info=MetricValueInfo(
            interval=Interval(min_value=0.0, max_value=1.0)
        ),
    )

  @override
  def format_auto_rater_prompt(
      self, actual_invocation: Invocation, _: Optional[Invocation]
  ) -> str:
    """Returns the auto-rater prompt for the given invocation.

    Fills the prompt template's placeholders with the user input, the tool
    calls/responses made during the invocation, the rubrics to assess, and the
    agent's tool declarations. The expected (second) invocation is unused by
    this metric.
    """

    user_input = get_text_from_content(actual_invocation.user_content)
    tool_usage = get_tool_calls_and_responses_as_json_str(
        actual_invocation.intermediate_data
    )
    # NOTE(review): only rubrics after the first get the "*  " bullet prefix;
    # confirm this formatting is intended by the prompt template.
    rubrics = "\n*  ".join(
        [r.rubric_content.text_property for r in self._rubrics]
    )

    app_details = actual_invocation.app_details
    # Fall back to an explicit sentinel when the invocation carries no app
    # details from which tool declarations could be derived.
    tool_declarations = "Agent has no tools."
    if app_details:
      tool_declarations = get_tool_declarations_as_json_str(app_details)

    return self._auto_rater_prompt_template.format(
        tool_declarations=tool_declarations,
        user_input=user_input,
        tool_usage=tool_usage,
        rubrics=rubrics,
    )
